
def load_smiles_to_set(file_path):
    """Read a text file and return its distinct non-blank lines as a set.

    Every line is stripped of leading/trailing whitespace first; lines that
    are empty after stripping are ignored. Duplicate lines collapse into a
    single set entry.
    """
    with open(file_path, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # The comprehension is evaluated here, while the file is still open.
        return {entry for entry in stripped_lines if entry}

def load_generated_mol(file_path):
    """Read a text file and return its non-blank lines as a list.

    Every line is stripped of leading/trailing whitespace first; lines that
    are empty after stripping are ignored. File order and duplicate entries
    are preserved.
    """
    with open(file_path, 'r') as handle:
        # filter(None, ...) drops the empty strings left by blank lines.
        return list(filter(None, map(str.strip, handle)))

# Compute novelty: the percentage of generated entries that do not appear in
# the reference file. Duplicate generated entries are each counted.
generated_file='./molecules_3_large_effective.txt'
generated_smiles=load_generated_mol(generated_file)
# NOTE(review): the names say "test" but the reference file is train.txt;
# confirm which split novelty is meant to be measured against.
test_file='./train.txt'
test_smiles=load_smiles_to_set(test_file)  # reference entries held as a set
novel_mol=[smiles for smiles in generated_smiles if smiles not in test_smiles]

if generated_smiles:
    novelty=len(novel_mol)/len(generated_smiles)*100
else:
    # An empty generated file would otherwise raise ZeroDivisionError;
    # report 0% novelty instead.
    novelty=0.0
print("novelty:%.2f%%"%novelty)
